suppressPackageStartupMessages({
library(data.table)
library(DESeq2)
library(gplots)
library(here)
library(hyperSpec)
library(parallel)
library(pander)
library(plotly)
library(RColorBrewer)
library(scatterplot3d)
library(tidyverse)
library(tximport)
library(vsn)
})
source(here("UPSCb-common/src/R/plot.multidensity.R"))
source(here("UPSCb-common/src/R/featureSelection.R"))
pal <- brewer.pal(8,"Dark2")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
samples <- readr::read_csv(here("doc/Samples.csv"),
col_types = cols(
col_character(),
col_factor(),
col_factor(),
col_factor(),
col_character()
))
tx2gene translation table
tx2gene <- suppressMessages(read_delim(here("reference/annotation/tx2gene.tsv.gz"),delim="\t",
col_names=c("TXID","GENE")))
filelist <- list.files(here("data/salmon"),
recursive = TRUE,
pattern = "quant.sf",
full.names = TRUE)
Sanity check to ensure that the data is sorted according to the sample info
samples <- samples[match(sub("_sortmerna_trimmomatic", "", basename(dirname(filelist))),
samples$sampleID),]
stopifnot(all(match(sub("_sortmerna_trimmomatic", "", basename(dirname(filelist))),
samples$sampleID) == 1:nrow(samples)))
name the file list vector
names(filelist) <- samples$Name
Read the expression at the gene level
counts <- suppressMessages(round(tximport(files = filelist,
type = "salmon",
tx2gene = tx2gene)$counts))
counts <- subset(counts, select = -c(X_S339, P_S339))
samples <- samples[-c(8, 18),]
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(counts), digits = 1),
sum(sel),
nrow(counts))
## [1] "25.3% percent (9378) of 37075 genes are not expressed"
dat <- tibble(x = colnames(counts), y = colSums(counts)) %>%
bind_cols(samples)
ggplot(dat, aes(x, y, fill = samples$Flexure)) + geom_col() +
scale_y_continuous(name = "reads") +
theme(axis.text.x = element_text(angle = 90, size = 4), axis.title.x = element_blank())
i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.
The cumulative gene coverage is as expected
ggplot(data.frame(value = log10(rowMeans(counts))), aes(x = value)) +
geom_density() + ggtitle("gene mean raw counts distribution") +
scale_x_continuous(name = "mean raw counts (log10)")
## Warning: Removed 9378 rows containing non-finite values (stat_density).
The same is done for the individual samples.
dat <- as.data.frame(log10(counts)) %>% utils::stack() %>%
mutate(Condition = samples$Flexure[match(ind, samples$Name)]) %>%
mutate(Tissue = samples$Tissue[match(ind, samples$Name)])
ggplot(dat, aes(x = values, group = ind, col = Tissue)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name ="per gene raw counts (log10)")
## Warning: Removed 240818 rows containing non-finite values (stat_density).
dir.create(here("data/analysis/salmon"), showWarnings = FALSE, recursive = TRUE)
write.csv(counts, file = here("data/analysis/salmon/raw-unormalised-gene-expression_data-S339.csv"))
For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate.
dds <- DESeqDataSetFromMatrix(
countData = counts,
colData = samples,
design = ~ TW + Flexure)
## converting counts to integer mode
colnames(dds) <- as.character(colData(dds)$Name)
saveRDS(dds, file = here("data/analysis/salmon/dds-S339.rds"))
save(dds, file = here("data/analysis/salmon/dds-S339.rda"))
Check the size factors (i.e. the sequencing library size effect)
dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
| P_R327 | P_R301 | P_R93 | P_R275 | P_R249 | P_S131 | P_S235 | P_S105 | P_S287 |
|---|---|---|---|---|---|---|---|---|
| 1.21 | 1.148 | 1.227 | 1.238 | 1.259 | 1.151 | 1.29 | 1.261 | 1.281 |
| X_R249 | X_R327 | X_R301 | X_R93 | X_R275 | X_S131 | X_S235 | X_S105 | X_S287 |
|---|---|---|---|---|---|---|---|---|
| 0.8302 | 0.7911 | 0.7843 | 0.7906 | 0.8315 | 0.9091 | 0.893 | 0.8527 | 0.7554 |
boxplot(sizes, main = "Sequencing libraries size factor")
vsd <- varianceStabilizingTransformation(dds, blind = TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
saveRDS(vst, file = here("data/analysis/salmon/vst-S339.rds"))
write.csv(vst, file = here("data/analysis/salmon/normalised-gene-expression_data-S339.csv"))
The variance stabilisation worked adequately
meanSdPlot(vst[rowSums(vst) > 0, ])
pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2, ]*100)
We define the number of variable of the model
nvar = 2
An the number of possible combinations
nlevel = nlevels(dds$Tissue) * nlevels(dds$Flexure) # 4 levels
We plot the percentage explained by the different components, the red line represent the number of variable in the model, the orange line the number of variable combinations.
ggplot(tibble(x = 1:length(percent), y = cumsum(percent)), aes(x = x, y = y)) +
geom_line() + scale_y_continuous("variance explained (%)", limits = c(0, 100)) +
scale_x_continuous("Principal component") +
geom_vline(xintercept = nvar, colour = "red", linetype = "dashed", size = 0.5) +
geom_hline(yintercept = cumsum(percent)[nvar], colour = "red", linetype = "dashed", size = 0.5) +
geom_vline(xintercept = nlevel, colour = "orange", linetype = "dashed", size = 0.5) +
geom_hline(yintercept = cumsum(percent)[nlevel], colour = "orange", linetype = "dashed", size = 0.5)
## Warning: Removed 5 row(s) containing missing values (geom_path).
mar = c(5.1, 4.1, 4.1, 2.1)
The PCA shows that a large fraction of the variance is explained by both variables.
scatterplot3d(pc$x[, 1],
pc$x[, 2],
pc$x[, 3],
xlab = paste("Comp. 1 (", percent[1], "%)", sep = ""),
ylab = paste("Comp. 2 (", percent[2], "%)", sep = ""),
zlab = paste("Comp. 3 (", percent[3], "%)", sep = ""),
color = pal[as.integer(dds$Tissue)],
pch = c(17:19)[as.integer(dds$Flexure)])
legend("topleft",
fill = pal[1:nlevels(dds$Tissue)],
legend = levels(dds$Tissue))
legend("topright",
pch = 17:19,
legend = levels(dds$Flexure))
par(mar = mar)
pc.dat <- bind_cols(PC1 = pc$x[, 1],
PC2 = pc$x[, 2],
samples)
p <- ggplot(pc.dat, aes(x = PC1, y = PC2, col = Tissue, shape = Flexure, text = Name)) +
geom_point(size = 2) +
ggtitle("Principal Component Analysis", subtitle = "variance stabilized counts")
ggplotly(p) %>%
layout(xaxis = list(title = paste("PC1 (", percent[1], "%)", sep = "")),
yaxis = list(title = paste("PC2 (", percent[2], "%)", sep = "")))
# add replicate number, plot number instead of shapes
Filter for noise
conds <- factor(paste(samples$Flexure))
sels <- rangeFeatureSelect(counts = vst,
conditions = conds,
nrep = 5)
vst.cutoff <- 2
par(mar=c(7, 4, 4, 2)+0.1)
hm <- heatmap.2(t(scale(t(vst[sels[[vst.cutoff + 1]], ]))),
distfun = pearson.dist,
hclustfun = function(X){hclust(X, method = "ward.D2")},
labRow = NA, trace = "none",
labCol = conds,
margins = c(12, 8),
srtCol = 45,
col = hpal)
plot(as.hclust(hm$colDendrogram), xlab = "", sub = "")
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel grid stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] vsn_3.62.0 tximport_1.22.0
## [3] forcats_0.5.1 stringr_1.4.0
## [5] dplyr_1.0.8 purrr_0.3.4
## [7] readr_2.1.2 tidyr_1.2.0
## [9] tibble_3.1.6 tidyverse_1.3.1
## [11] scatterplot3d_0.3-41 RColorBrewer_1.1-2
## [13] plotly_4.10.0 pander_0.6.4
## [15] hyperSpec_0.100.0 xml2_1.3.3
## [17] ggplot2_3.3.5 lattice_0.20-45
## [19] here_1.0.1 gplots_3.1.1
## [21] DESeq2_1.34.0 SummarizedExperiment_1.24.0
## [23] Biobase_2.54.0 MatrixGenerics_1.6.0
## [25] matrixStats_0.61.0 GenomicRanges_1.46.1
## [27] GenomeInfoDb_1.30.1 IRanges_2.28.0
## [29] S4Vectors_0.32.3 BiocGenerics_0.40.0
## [31] data.table_1.14.2
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.4.1 lazyeval_0.2.2
## [4] splines_4.1.2 crosstalk_1.2.0 BiocParallel_1.28.3
## [7] digest_0.6.29 htmltools_0.5.2 fansi_1.0.2
## [10] magrittr_2.0.2 memoise_2.0.1 tzdb_0.2.0
## [13] limma_3.50.1 Biostrings_2.62.0 annotate_1.72.0
## [16] modelr_0.1.8 vroom_1.5.7 jpeg_0.1-9
## [19] colorspace_2.0-3 blob_1.2.2 rvest_1.0.2
## [22] haven_2.4.3 xfun_0.29 hexbin_1.28.2
## [25] crayon_1.5.0 RCurl_1.98-1.6 jsonlite_1.7.3
## [28] genefilter_1.76.0 survival_3.2-13 glue_1.6.1
## [31] gtable_0.3.0 zlibbioc_1.40.0 XVector_0.34.0
## [34] DelayedArray_0.20.0 scales_1.1.1 DBI_1.1.2
## [37] Rcpp_1.0.8 viridisLite_0.4.0 xtable_1.8-4
## [40] bit_4.0.4 preprocessCore_1.56.0 htmlwidgets_1.5.4
## [43] httr_1.4.2 ellipsis_0.3.2 pkgconfig_2.0.3
## [46] XML_3.99-0.9 farver_2.1.0 sass_0.4.0
## [49] dbplyr_2.1.1 locfit_1.5-9.4 utf8_1.2.2
## [52] tidyselect_1.1.1 labeling_0.4.2 rlang_1.0.0
## [55] AnnotationDbi_1.56.2 munsell_0.5.0 cellranger_1.1.0
## [58] tools_4.1.2 cachem_1.0.6 cli_3.2.0
## [61] generics_0.1.2 RSQLite_2.2.10 broom_0.7.12
## [64] evaluate_0.15 fastmap_1.1.0 yaml_2.3.5
## [67] knitr_1.37 bit64_4.0.5 fs_1.5.2
## [70] caTools_1.18.2 KEGGREST_1.34.0 brio_1.1.3
## [73] compiler_4.1.2 rstudioapi_0.13 png_0.1-7
## [76] testthat_3.1.2 affyio_1.64.0 reprex_2.0.1
## [79] geneplotter_1.72.0 bslib_0.3.1 stringi_1.7.6
## [82] highr_0.9 Matrix_1.3-4 vctrs_0.3.8
## [85] pillar_1.7.0 lifecycle_1.0.1 BiocManager_1.30.16
## [88] jquerylib_0.1.4 bitops_1.0-7 R6_2.5.1
## [91] latticeExtra_0.6-29 affy_1.72.0 KernSmooth_2.23-20
## [94] gtools_3.9.2 assertthat_0.2.1 rprojroot_2.0.2
## [97] withr_2.4.3 GenomeInfoDbData_1.2.7 hms_1.1.1
## [100] rmarkdown_2.12 lubridate_1.8.0